HW 03

Author

Meredith Casella Jean-Baptiste

#SETUP for THEMES 
if (!require("pacman"))
  install.packages("pacman")
Loading required package: pacman
pacman::p_load(here)
pacman::p_load(tidyverse, colorspace, palmerpenguins, fs, lubridate, scales, openintro, gghighlight, glue, ggridges, dplyr, tidyr, forcats, dsbox, ggplot2)
devtools::install_github("tidyverse/dsbox")
Skipping install of 'dsbox' from a github remote, the SHA1 (244ecdfe) has not changed since last install.
  Use `force = TRUE` to force installation
library(dsbox)

ggplot2::theme_set(ggplot2::theme_minimal(base_size = 14))

options(width = 65)

knitr::opts_chunk$set(
  fig.width = 7,        # 7" width
  fig.asp = 0.618,      # the golden ratio
  fig.retina = 3,       # dpi multiplier for displaying HTML output on retina
  fig.align = "center", # center align figures
  dpi = 300             # higher dpi, sharper image
)

1 - Du Bois challenge.

  if (!require("pacman"))
  install.packages("pacman")
pacman::p_load(here)
pacman::p_load(tidyverse, janitor, patchwork, extrafont, pBrackets, colorspace, palmerpenguins, fs, lubridate, scales, openintro, gghighlight, glue, ggridges, dplyr, tidyr, forcats, ggimage, grid, png)

  income <-read_csv(here("data", "income.csv")) %>% 
    janitor::clean_names()
Rows: 7 Columns: 7
── Column specification ─────────────────────────────────────────
Delimiter: ","
chr (1): Class
dbl (6): Average_Income, Rent, Food, Clothes, Tax, Other

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
  #I tried to add in the parchment paper image for the assignment but could not get this to work
#  img <- readPNG(here("Parchment_paper.png"))
  
  income <- income %>% 
    pivot_longer(cols = c('rent', 'food', 'clothes', 'tax', 'other'),
                 names_to = "category",
                 values_to = "percentage")
  income %>% 
    ggplot(aes(x = percentage, y = class, fill = category)) +
    geom_bar(stat = "identity", color = "white")+
    #unsuccessful in adding background image from png file
#    annotation_custom(rasterGrob(img)) +
    scale_fill_manual(
      values = c("rent" = "black", "food" = "#B19cd9", "clothes" = "#ffc0cb", "tax" = "#a1a1a1", "other" = "#e9e9e9")) +
    #percentage labels not lining up well nor with % symbols
    geom_text(aes(label = percentage), 
              position = position_stack(reverse = TRUE,vjust = 0.1)) +
    theme(legend.position = "none") +
      labs(
        y = "",
        x = "",
        title = "INCOME AND EXPENDITURE OF 150 NEGRO FAMILIES IN ATLANTA, GA., U.S.A.")+
    theme(plot.title = element_text(hjust = 0.2))

2 - COVID survey - interpret

# INTERPRETATION
#Your task for this question is to take a close look at this plot and interpret it. There is a lot going on here, which is customary for plots that go into scientific publications – they tend to be very information dense, for better or worse… As you interpret it, discuss if there are any results that agree or disagree with your intuition. There is a lot you can say, but we don’t need you to be exhaustive. Please provide three concrete examples.
#The plot overall shows a few interesting points about the Covid-19 vaccine. 
#First, there is a wide variety in whether people think the vaccine is safe, with the mean agreeing but a wide percentile spread in the responses.
#Second, overall participants agreed that they would feel safer at work if they were vaccinated with >30 and < 20 year old groups feeling less so, and most interestingly, those who are unvaccinated, Native Americans, and those who preferred not to disclose their gender, would feel less safer at work, where there was more of a wide percentile spread in the responses.
#Third, there is an overall concern for side effects and vaccine safety across almost all groups
#Fourth, the majority of participants are both confident in and trust the information they have received on the vaccines with the exception of the unvaccinated, those who had the flu vaccine, Native Americans, and those who preferred not to disclose their gender groups.
#Lastly, almost all groups of participants would recommend the vaccine to family or friends with the exception of the unvaccinated; however, Native Americans and those who preferred not to disclose their gender had a wider percentile spread in their responses.

3 - COVID survey - reconstruct

#devtools::install_github("tidyverse/dsbox")
#library(dsbox)
covid_survey <-read_csv(here("data", "covid-survey.csv"))
New names:
Rows: 1122 Columns: 14
── Column specification
───────────────────────────────────────── Delimiter: "," chr
(14): likert_survey, ...2, ...3, ...4, ...5, ...6, ...7, ...
ℹ Use `spec()` to retrieve the full column specification for
this data. ℹ Specify the column types or set `show_col_types =
FALSE` to quiet this message.
• `` -> `...2`
• `` -> `...3`
• `` -> `...4`
• `` -> `...5`
• `` -> `...6`
• `` -> `...7`
• `` -> `...8`
• `` -> `...9`
• `` -> `...10`
• `` -> `...11`
• `` -> `...12`
• `` -> `...13`
• `` -> `...14`
  # Rename columns to first-row names
colnames(covid_survey)[2] = "exp_profession"
colnames(covid_survey)[1] = "response_id"
colnames(covid_survey)[3] = "exp_flu_vax"
colnames(covid_survey)[4] = "exp_gender"
colnames(covid_survey)[5] = "exp_race"
colnames(covid_survey)[6] = "exp_ethnicity"
colnames(covid_survey)[7] = "exp_age_bin"
colnames(covid_survey)[8] = "exp_already_vax"
colnames(covid_survey)[9] = "resp_safety"
colnames(covid_survey)[10] = "resp_confidence_science"
colnames(covid_survey)[11] = "resp_concern_safety"
colnames(covid_survey)[12] = "resp_feel_safe_at_work"
colnames(covid_survey)[13] = "resp_will_recommend"
colnames(covid_survey)[14] = "resp_trust_info"
dim(covid_survey)
[1] 1122   14
#remove first row
covid_survey <- covid_survey[-1, ]
#remove only rows with all NA, the first version did not work
#covid_survey |>
#  filter(if_all(everything(), ~!is.na(.x)))
covid_survey <- covid_survey[!(is.na(covid_survey$exp_profession)) | !(is.na(covid_survey$exp_flu_vax)) | !(is.na(covid_survey$exp_gender)) | !(is.na(covid_survey$exp_race)) | !(is.na(covid_survey$exp_ethnicity)) | !(is.na(covid_survey$exp_age_bin)) | !(is.na(covid_survey$exp_already_vax)) | !(is.na(covid_survey$resp_safety)) | !(is.na(covid_survey$resp_confidence_science)) | !(is.na(covid_survey$resp_concern_safety)) | !(is.na(covid_survey$resp_feel_safe_at_work)) | !(is.na(covid_survey$resp_will_recommend)) | !(is.na(covid_survey$resp_trust_info)),]

#convert chr to dbl
covid_survey$resp_safety <- as.numeric(as.character(covid_survey$resp_safety)) 
covid_survey$resp_confidence_science <- as.numeric(as.character(covid_survey$resp_confidence_science)) 
covid_survey$resp_concern_safety <- as.numeric(as.character(covid_survey$resp_concern_safety)) 
covid_survey$resp_feel_safe_at_work <- as.numeric(as.character(covid_survey$resp_feel_safe_at_work)) 
covid_survey$resp_will_recommend <- as.numeric(as.character(covid_survey$resp_will_recommend)) 
covid_survey$resp_trust_info <- as.numeric(as.character(covid_survey$resp_trust_info)) 

library(dplyr)
covid_survey <- covid_survey %>% 
  mutate(
  exp_already_vax=recode(exp_already_vax, '0'='No', '1'='Yes'),
  exp_flu_vax=recode(exp_flu_vax, '0'='No', '1'='Yes'),
  exp_profession=recode(exp_profession, '0'='Medical', '1'='Nursing'),
  exp_gender=recode(exp_gender, '0'='Male', '1'='Female', '3' = 'Non-binary third gender', '4' = 'Prefer not to say'),
  exp_race=recode(exp_race, '1'='American Indian/ Alaskan Native', '2'='Asian', '3'='Black/ African American', '4'='Native Hawaiian/ Other', '5' = 'White'),
  exp_ethnicity=recode(exp_ethnicity, '1'='Hispanic/ Latino', '2'='Non-Hispanic/ Non-Latino'),
  exp_age_bin=recode(exp_age_bin, '0'='<20', '20'='21-25', '25'='26-30', '30'='>30' ))

#NOTE:
# The first pivot-longer is pivoting the explanatory values (types of categories created such as profession, gender, age category, etc.) and the values for these.
# The second pivot-longer is pivoting the responses according to category of participants such as those who were vaccinated or not, etc.
# Together we will have a long table of values per category which we will now be able to visualize and calculate the mean for each.
library(tidyr)
covid_survey_longer <- covid_survey |>
  pivot_longer(
    cols = starts_with("exp_"),
    names_to = "explanatory",
    values_to = "explanatory_value"
  ) |>
  filter(!is.na(explanatory_value)) |>
  pivot_longer(
    cols = starts_with("resp_"),
    names_to = "response",
    values_to = "response_value"
  )
# Grouped and calculate summary statistics for each response variable by explanatory variables
covid_survey_stats_by_group <- covid_survey_longer %>%
  group_by(explanatory, explanatory_value, response) %>% 
  dplyr::summarise("low" = quantile(response_value, probs = .1, na.rm = TRUE),
            "high" = quantile(response_value, probs = .9, na.rm = TRUE),
            mean_response = mean(response_value),
            .groups = "drop")   

# Grouped and calculate summary statistics for each response variable not conditioned on explanatory variables
covid_survey_summary_stats_all <- covid_survey_longer %>% 
  group_by(response) %>% 
  dplyr::summarise("low" = quantile(response_value, probs = .1, na.rm = TRUE),
            "high" = quantile(response_value, probs = .9, na.rm = TRUE),
            mean_response = mean(response_value),
            .groups = "drop") 


# Add explanatory and explanatory_value to the overall summary to match grouped summary
covid_survey_summary_stats_all <- covid_survey_summary_stats_all %>%
  mutate(
    explanatory = "All",
    explanatory_value = "All"
  ) %>%
  select(explanatory, explanatory_value, response, low, high, mean_response)

# Bind the two
# Binding them by row
covid_survey_summary_stats <- bind_rows(
  covid_survey_summary_stats_all,
  covid_survey_stats_by_group
)
#recode the data labels
covid_survey_summary_stats <- covid_survey_summary_stats %>%
  mutate(response = recode(response,
    "resp_safety"               = "Based on my understanding, I believe the vaccine is safe",
    "resp_confidence_science"  = "I trust the science behind the vaccine",
    "resp_concern_safety"      = "I am concerned about the safety of the vaccine",
    "resp_feel_safe_at_work"   = "The vaccine will make me feel safe at work",
    "resp_will_recommend"      = "I will recommend the vaccine to others",
    "resp_trust_info"          = "I trust the information I’ve received about the vaccine"
  ))
#recode the explanatory variables
covid_survey_summary_stats <- covid_survey_summary_stats %>%
  mutate(explanatory = recode(explanatory,
    "exp_profession"      = "Profession",
    "exp_flu_vax"         = "Received Flu Vaccine",
    "exp_gender"          = "Gender",
    "exp_race"            = "Race",
    "exp_ethnicity"       = "Ethnicity",
    "exp_age_bin"         = "Age Group",
    "exp_already_vax"     = "Already Vaccinated"
  ))

#graph the data; I could not get the response codes across the top nor the explanatory variables on the right
ggplot(covid_survey_summary_stats, aes(y = explanatory_value, x = mean_response)) +
  geom_point(color = "steelblue", size = 1) +
  geom_errorbar(aes(xmin = low, xmax = high), width = 0.2, color = "steelblue") +
  facet_grid(response ~ explanatory,
             labeller = labeller(response = label_wrap_gen(15))) +
  theme_void() +
  theme(
    strip.placement = "outside",
    strip.background = element_blank(),
    strip.text.y.right = element_text(angle = 0), 
    strip.text.x.top = element_text(angle = 0),
    axis.text.y = element_text(size = 10),
    axis.text.x = element_text(size = 10)
  ) +
  labs(
    x = NULL,y = NULL
  )
Warning: Removed 66 rows containing missing values or values outside the
scale range (`geom_point()`).

4 - COVID survey - re-reconstruct

covid_survey <-read_csv(here("data", "covid-survey.csv"))
New names:
Rows: 1122 Columns: 14
── Column specification
───────────────────────────────────────── Delimiter: "," chr
(14): likert_survey, ...2, ...3, ...4, ...5, ...6, ...7, ...
ℹ Use `spec()` to retrieve the full column specification for
this data. ℹ Specify the column types or set `show_col_types =
FALSE` to quiet this message.
• `` -> `...2`
• `` -> `...3`
• `` -> `...4`
• `` -> `...5`
• `` -> `...6`
• `` -> `...7`
• `` -> `...8`
• `` -> `...9`
• `` -> `...10`
• `` -> `...11`
• `` -> `...12`
• `` -> `...13`
• `` -> `...14`
  # Rename columns to first-row names
colnames(covid_survey)[2] = "exp_profession"
colnames(covid_survey)[1] = "response_id"
colnames(covid_survey)[3] = "exp_flu_vax"
colnames(covid_survey)[4] = "exp_gender"
colnames(covid_survey)[5] = "exp_race"
colnames(covid_survey)[6] = "exp_ethnicity"
colnames(covid_survey)[7] = "exp_age_bin"
colnames(covid_survey)[8] = "exp_already_vax"
colnames(covid_survey)[9] = "resp_safety"
colnames(covid_survey)[10] = "resp_confidence_science"
colnames(covid_survey)[11] = "resp_concern_safety"
colnames(covid_survey)[12] = "resp_feel_safe_at_work"
colnames(covid_survey)[13] = "resp_will_recommend"
colnames(covid_survey)[14] = "resp_trust_info"
dim(covid_survey)
[1] 1122   14
#remove first row
covid_survey <- covid_survey[-1, ]
#remove only rows with all NA, the first version did not work
#covid_survey |>
#  filter(if_all(everything(), ~!is.na(.x)))
covid_survey <- covid_survey[!(is.na(covid_survey$exp_profession)) | !(is.na(covid_survey$exp_flu_vax)) | !(is.na(covid_survey$exp_gender)) | !(is.na(covid_survey$exp_race)) | !(is.na(covid_survey$exp_ethnicity)) | !(is.na(covid_survey$exp_age_bin)) | !(is.na(covid_survey$exp_already_vax)) | !(is.na(covid_survey$resp_safety)) | !(is.na(covid_survey$resp_confidence_science)) | !(is.na(covid_survey$resp_concern_safety)) | !(is.na(covid_survey$resp_feel_safe_at_work)) | !(is.na(covid_survey$resp_will_recommend)) | !(is.na(covid_survey$resp_trust_info)),]

#convert chr to dbl
covid_survey$resp_safety <- as.numeric(as.character(covid_survey$resp_safety)) 
covid_survey$resp_confidence_science <- as.numeric(as.character(covid_survey$resp_confidence_science)) 
covid_survey$resp_concern_safety <- as.numeric(as.character(covid_survey$resp_concern_safety)) 
covid_survey$resp_feel_safe_at_work <- as.numeric(as.character(covid_survey$resp_feel_safe_at_work)) 
covid_survey$resp_will_recommend <- as.numeric(as.character(covid_survey$resp_will_recommend)) 
covid_survey$resp_trust_info <- as.numeric(as.character(covid_survey$resp_trust_info)) 

library(dplyr)
covid_survey <- covid_survey %>% 
  mutate(
  exp_already_vax=recode(exp_already_vax, '0'='No', '1'='Yes'),
  exp_flu_vax=recode(exp_flu_vax, '0'='No', '1'='Yes'),
  exp_profession=recode(exp_profession, '0'='Medical', '1'='Nursing'),
  exp_gender=recode(exp_gender, '0'='Male', '1'='Female', '3' = 'Non-binary third gender', '4' = 'Prefer not to say'),
  exp_race=recode(exp_race, '1'='American Indian/ Alaskan Native', '2'='Asian', '3'='Black/ African American', '4'='Native Hawaiian/ Other', '5' = 'White'),
  exp_ethnicity=recode(exp_ethnicity, '1'='Hispanic/ Latino', '2'='Non-Hispanic/ Non-Latino'),
  exp_age_bin=recode(exp_age_bin, '0'='<20', '20'='21-25', '25'='26-30', '30'='>30' ))

#NOTE:
# The first pivot-longer is pivoting the explanatory values (types of categories created such as profession, gender, age category, etc.) and the values for these.
# The second pivot-longer is pivoting the responses according to category of participants such as those who were vaccinated or not, etc.
# Together we will have a long table of values per category which we will now be able to visualize and calculate the mean for each.
library(tidyr)
covid_survey_longer <- covid_survey |>
  pivot_longer(
    cols = starts_with("exp_"),
    names_to = "explanatory",
    values_to = "explanatory_value"
  ) |>
  filter(!is.na(explanatory_value)) |>
  pivot_longer(
    cols = starts_with("resp_"),
    names_to = "response",
    values_to = "response_value"
  )
# Grouped and calculate summary statistics for each response variable by explanatory variables
covid_survey_stats_by_group <- covid_survey_longer %>%
  group_by(explanatory, explanatory_value, response) %>% 
  dplyr::summarise("low" = quantile(response_value, probs = .25, na.rm = TRUE),
            "high" = quantile(response_value, probs = .75, na.rm = TRUE),
            mean_response = mean(response_value),
            .groups = "drop")   

# Grouped and calculate summary statistics for each response variable not conditioned on explanatory variables
covid_survey_summary_stats_all <- covid_survey_longer %>% 
  group_by(response) %>% 
  dplyr::summarise("low" = quantile(response_value, probs = .25, na.rm = TRUE),
            "high" = quantile(response_value, probs = .75, na.rm = TRUE),
            mean_response = mean(response_value),
            .groups = "drop") 


# Add explanatory and explanatory_value to the overall summary to match grouped summary
covid_survey_summary_stats_all <- covid_survey_summary_stats_all %>%
  mutate(
    explanatory = "All",
    explanatory_value = "All"
  ) %>%
  select(explanatory, explanatory_value, response, low, high, mean_response)

# Bind the two
# Binding them by row
covid_survey_summary_stats <- bind_rows(
  covid_survey_summary_stats_all,
  covid_survey_stats_by_group
)
#recode the data labels
covid_survey_summary_stats <- covid_survey_summary_stats %>%
  mutate(response = recode(response,
    "resp_safety"               = "Based on my understanding, I believe the vaccine is safe",
    "resp_confidence_science"  = "I trust the science behind the vaccine",
    "resp_concern_safety"      = "I am concerned about the safety of the vaccine",
    "resp_feel_safe_at_work"   = "The vaccine will make me feel safe at work",
    "resp_will_recommend"      = "I will recommend the vaccine to others",
    "resp_trust_info"          = "I trust the information I’ve received about the vaccine"
  ))
#recode the explanatory variables
covid_survey_summary_stats <- covid_survey_summary_stats %>%
  mutate(explanatory = recode(explanatory,
    "exp_profession"      = "Profession",
    "exp_flu_vax"         = "Received Flu Vaccine",
    "exp_gender"          = "Gender",
    "exp_race"            = "Race",
    "exp_ethnicity"       = "Ethnicity",
    "exp_age_bin"         = "Age Group",
    "exp_already_vax"     = "Already Vaccinated"
  ))

#graph the data; I could not get the response codes across the top nor the explanatory variables on the right
ggplot(covid_survey_summary_stats, aes(y = explanatory_value, x = mean_response)) +
  geom_point(color = "steelblue", size = 1) +
  geom_errorbar(aes(xmin = low, xmax = high), width = 0.2, color = "steelblue") +
  facet_grid(response ~ explanatory,
             labeller = labeller(response = label_wrap_gen(15))) +
  theme_void() +
  theme(
    strip.placement = "outside",
    strip.background = element_blank(),
    strip.text.y.right = element_text(angle = 0), 
    strip.text.x.top = element_text(angle = 0),
    axis.text.y = element_text(size = 8),
    axis.text.x = element_text(size = 8)
  ) +
  labs(
    x = NULL,y = NULL
  )
Warning: Removed 66 rows containing missing values or values outside the
scale range (`geom_point()`).

5 - COVID survey - 5a: Diverging barchart 100%

covid_survey <-read_csv(here("data", "covid-survey.csv"))
New names:
Rows: 1122 Columns: 14
── Column specification
───────────────────────────────────────── Delimiter: "," chr
(14): likert_survey, ...2, ...3, ...4, ...5, ...6, ...7, ...
ℹ Use `spec()` to retrieve the full column specification for
this data. ℹ Specify the column types or set `show_col_types =
FALSE` to quiet this message.
• `` -> `...2`
• `` -> `...3`
• `` -> `...4`
• `` -> `...5`
• `` -> `...6`
• `` -> `...7`
• `` -> `...8`
• `` -> `...9`
• `` -> `...10`
• `` -> `...11`
• `` -> `...12`
• `` -> `...13`
• `` -> `...14`
  # Rename columns to first-row names
colnames(covid_survey)[2] = "exp_profession"
colnames(covid_survey)[1] = "response_id"
colnames(covid_survey)[3] = "exp_flu_vax"
colnames(covid_survey)[4] = "exp_gender"
colnames(covid_survey)[5] = "exp_race"
colnames(covid_survey)[6] = "exp_ethnicity"
colnames(covid_survey)[7] = "exp_age_bin"
colnames(covid_survey)[8] = "exp_already_vax"
colnames(covid_survey)[9] = "resp_safety"
colnames(covid_survey)[10] = "resp_confidence_science"
colnames(covid_survey)[11] = "resp_concern_safety"
colnames(covid_survey)[12] = "resp_feel_safe_at_work"
colnames(covid_survey)[13] = "resp_will_recommend"
colnames(covid_survey)[14] = "resp_trust_info"
dim(covid_survey)
[1] 1122   14
#remove first row
covid_survey <- covid_survey[-1, ]
#remove only rows with all NA, the first version did not work
#covid_survey |>
#  filter(if_all(everything(), ~!is.na(.x)))
covid_survey <- covid_survey[!(is.na(covid_survey$exp_profession)) | !(is.na(covid_survey$exp_flu_vax)) | !(is.na(covid_survey$exp_gender)) | !(is.na(covid_survey$exp_race)) | !(is.na(covid_survey$exp_ethnicity)) | !(is.na(covid_survey$exp_age_bin)) | !(is.na(covid_survey$exp_already_vax)) | !(is.na(covid_survey$resp_safety)) | !(is.na(covid_survey$resp_confidence_science)) | !(is.na(covid_survey$resp_concern_safety)) | !(is.na(covid_survey$resp_feel_safe_at_work)) | !(is.na(covid_survey$resp_will_recommend)) | !(is.na(covid_survey$resp_trust_info)),]

#convert chr to dbl
covid_survey$resp_safety <- as.numeric(as.character(covid_survey$resp_safety)) 
covid_survey$resp_confidence_science <- as.numeric(as.character(covid_survey$resp_confidence_science)) 
covid_survey$resp_concern_safety <- as.numeric(as.character(covid_survey$resp_concern_safety)) 
covid_survey$resp_feel_safe_at_work <- as.numeric(as.character(covid_survey$resp_feel_safe_at_work)) 
covid_survey$resp_will_recommend <- as.numeric(as.character(covid_survey$resp_will_recommend)) 
covid_survey$resp_trust_info <- as.numeric(as.character(covid_survey$resp_trust_info)) 

library(dplyr)
covid_survey <- covid_survey %>% 
  mutate(
  exp_already_vax=recode(exp_already_vax, '0'='No', '1'='Yes'),
  exp_flu_vax=recode(exp_flu_vax, '0'='No', '1'='Yes'),
  exp_profession=recode(exp_profession, '0'='Medical', '1'='Nursing'),
  exp_gender=recode(exp_gender, '0'='Male', '1'='Female', '3' = 'Non-binary third gender', '4' = 'Prefer not to say'),
  exp_race=recode(exp_race, '1'='American Indian/ Alaskan Native', '2'='Asian', '3'='Black/ African American', '4'='Native Hawaiian/ Other', '5' = 'White'),
  exp_ethnicity=recode(exp_ethnicity, '1'='Hispanic/ Latino', '2'='Non-Hispanic/ Non-Latino'),
  exp_age_bin=recode(exp_age_bin, '0'='<20', '20'='21-25', '25'='26-30', '30'='>30' ))


library(tidyr)
covid_survey_longer <- covid_survey |>
  pivot_longer(
    cols = starts_with("exp_"),
    names_to = "explanatory",
    values_to = "explanatory_value"
  ) |>
  filter(!is.na(explanatory_value)) |>
  pivot_longer(
    cols = starts_with("resp_"),
    names_to = "response",
    values_to = "response_value"
  )

#recode the data labels
covid_survey_longer <- covid_survey_longer %>%
  mutate(response = recode(response,
    "resp_safety"               = "Based on my understanding, I believe the vaccine is safe",
    "resp_confidence_science"  = "I trust the science behind the vaccine",
    "resp_concern_safety"      = "I am concerned about the safety of the vaccine",
    "resp_feel_safe_at_work"   = "The vaccine will make me feel safe at work",
    "resp_will_recommend"      = "I will recommend the vaccine to others",
    "resp_trust_info"          = "I trust the information I’ve received about the vaccine"
  ))


likert <- covid_survey_longer %>%
  filter(!is.na(response_value)) %>%
  mutate(
    response_score = recode(response_value,
      "Approve" = 1,
      "Slightly Approve" = 2,
      "Neutral" = 3,
      "Slightly Disapprove" = 4,
      "Disapprove" = 5
    ),
    positive_negative = ifelse(response_score <= 3, "Positive", "Negative")
  ) %>%
  count(response, positive_negative) %>%
  group_by(response) %>%
  mutate(percent = 100 * n / sum(n)) %>%
  ungroup() %>%
  mutate(percent = ifelse(positive_negative == "Negative", -percent, percent))
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `response_score = recode(...)`.
Caused by warning in `recode.numeric()`:
! NAs introduced by coercion
#Plot diverging bar chart
likert_plot <- ggplot(likert, aes(x = response, y = percent, fill = positive_negative)) +
  geom_col() +
  scale_y_continuous(labels = abs) +
  scale_fill_manual(values = c("Positive" = "#1b9e77", "Negative" = "#d95f02")) +
  labs(
    x = NULL,
    y = "Percentage",
    fill = "Response Sentiment",
    title = "Covid-19 vaccination attitudes and opinions survey:\nResponses by percentage of approval",
    caption = "Pavan Shah, Giorgio Caturegli, Galen Shi, and Joshua Materi at Johns Hopkins School of Medicine"
  ) +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8))

# display the plot
likert_plot

5 - COVID survey - 5b: Stacked barchart 100%

covid_survey <-read_csv(here("data", "covid-survey.csv"))
New names:
Rows: 1122 Columns: 14
── Column specification
───────────────────────────────────────── Delimiter: "," chr
(14): likert_survey, ...2, ...3, ...4, ...5, ...6, ...7, ...
ℹ Use `spec()` to retrieve the full column specification for
this data. ℹ Specify the column types or set `show_col_types =
FALSE` to quiet this message.
• `` -> `...2`
• `` -> `...3`
• `` -> `...4`
• `` -> `...5`
• `` -> `...6`
• `` -> `...7`
• `` -> `...8`
• `` -> `...9`
• `` -> `...10`
• `` -> `...11`
• `` -> `...12`
• `` -> `...13`
• `` -> `...14`
  # Rename columns to first-row names
colnames(covid_survey)[2] = "exp_profession"
colnames(covid_survey)[1] = "response_id"
colnames(covid_survey)[3] = "exp_flu_vax"
colnames(covid_survey)[4] = "exp_gender"
colnames(covid_survey)[5] = "exp_race"
colnames(covid_survey)[6] = "exp_ethnicity"
colnames(covid_survey)[7] = "exp_age_bin"
colnames(covid_survey)[8] = "exp_already_vax"
colnames(covid_survey)[9] = "resp_safety"
colnames(covid_survey)[10] = "resp_confidence_science"
colnames(covid_survey)[11] = "resp_concern_safety"
colnames(covid_survey)[12] = "resp_feel_safe_at_work"
colnames(covid_survey)[13] = "resp_will_recommend"
colnames(covid_survey)[14] = "resp_trust_info"
dim(covid_survey)
[1] 1122   14
#remove first row
covid_survey <- covid_survey[-1, ]
#remove only rows with all NA, the first version did not work
#covid_survey |>
#  filter(if_all(everything(), ~!is.na(.x)))
covid_survey <- covid_survey[!(is.na(covid_survey$exp_profession)) | !(is.na(covid_survey$exp_flu_vax)) | !(is.na(covid_survey$exp_gender)) | !(is.na(covid_survey$exp_race)) | !(is.na(covid_survey$exp_ethnicity)) | !(is.na(covid_survey$exp_age_bin)) | !(is.na(covid_survey$exp_already_vax)) | !(is.na(covid_survey$resp_safety)) | !(is.na(covid_survey$resp_confidence_science)) | !(is.na(covid_survey$resp_concern_safety)) | !(is.na(covid_survey$resp_feel_safe_at_work)) | !(is.na(covid_survey$resp_will_recommend)) | !(is.na(covid_survey$resp_trust_info)),]

#convert chr to dbl
covid_survey$resp_safety <- as.numeric(as.character(covid_survey$resp_safety)) 
covid_survey$resp_confidence_science <- as.numeric(as.character(covid_survey$resp_confidence_science)) 
covid_survey$resp_concern_safety <- as.numeric(as.character(covid_survey$resp_concern_safety)) 
covid_survey$resp_feel_safe_at_work <- as.numeric(as.character(covid_survey$resp_feel_safe_at_work)) 
covid_survey$resp_will_recommend <- as.numeric(as.character(covid_survey$resp_will_recommend)) 
covid_survey$resp_trust_info <- as.numeric(as.character(covid_survey$resp_trust_info)) 

library(dplyr)
covid_survey <- covid_survey %>% 
  mutate(
  exp_already_vax=recode(exp_already_vax, '0'='No', '1'='Yes'),
  exp_flu_vax=recode(exp_flu_vax, '0'='No', '1'='Yes'),
  exp_profession=recode(exp_profession, '0'='Medical', '1'='Nursing'),
  exp_gender=recode(exp_gender, '0'='Male', '1'='Female', '3' = 'Non-binary third gender', '4' = 'Prefer not to say'),
  exp_race=recode(exp_race, '1'='American Indian/ Alaskan Native', '2'='Asian', '3'='Black/ African American', '4'='Native Hawaiian/ Other', '5' = 'White'),
  exp_ethnicity=recode(exp_ethnicity, '1'='Hispanic/ Latino', '2'='Non-Hispanic/ Non-Latino'),
  exp_age_bin=recode(exp_age_bin, '0'='<20', '20'='21-25', '25'='26-30', '30'='>30' ))


library(tidyr)
covid_survey_longer <- covid_survey |>
  pivot_longer(
    cols = starts_with("exp_"),
    names_to = "explanatory",
    values_to = "explanatory_value"
  ) |>
  filter(!is.na(explanatory_value)) |>
  pivot_longer(
    cols = starts_with("resp_"),
    names_to = "response",
    values_to = "response_value"
  )

#recode the data labels
covid_survey_longer <- covid_survey_longer %>%
  mutate(response = recode(response,
    "resp_safety"               = "Based on my understanding, I believe the vaccine is safe",
    "resp_confidence_science"  = "I trust the science behind the vaccine",
    "resp_concern_safety"      = "I am concerned about the safety of the vaccine",
    "resp_feel_safe_at_work"   = "The vaccine will make me feel safe at work",
    "resp_will_recommend"      = "I will recommend the vaccine to others",
    "resp_trust_info"          = "I trust the information I’ve received about the vaccine"
  ))

#from the pivot longer we will look at the likert resonses per question
likert_percentages <- covid_survey_longer %>%
  filter(!is.na(response_value)) %>%
  group_by(response, response_value) %>%
  summarise(count = n(), .groups = "drop") %>% 
  group_by(response) %>%
  mutate(percentage = count / sum(count) * 100)


#Recode Likert values
likert_percentages$response_value <- factor(likert_percentages$response_value,
    levels = 1:5,
    labels = c("Approve", "Slightly Approve", "Neutral", "Slightly Disapprove", "Disapprove"))

#Plot stacked bar chart
ggplot(likert_percentages, aes(x = response, y = percentage, fill = response_value)) +
  geom_col() +
  scale_fill_brewer(palette = "Viridis", direction = -1, name = "Response") +
  labs(x = NULL, y = "Percentage", title = "Covid-19 vaccination attitudes and opinions survey: \nResponses by percentage of approval", citation = "Pavan Shah, Giorgio Caturegli, Galen Shi, and Joshua Materi at Johns Hopkins School of Medicine") +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(angle = 30, hjust = 1, size = 8))
Warning: Unknown palette: "Viridis"

Citations

#1: #DuBois challenge from tidy_tuesday #, how to create a stacked bar plot https://github.com/marisalyn/tidy_tuesday/blob/master/20210216_dubois.Rmd https://stackoverflow.com/questions/6851522/how-do-i-plot-a-stacked-bar-with-ggplot

#3: How to recode in R: https://www.statology.org/recode-dplyr/ #remove rows with all NA: https://stackoverflow.com/questions/4862178/remove-rows-with-all-or-some-nas-missing-values-in-data-frame #omit rows in ggplot: https://stackoverflow.com/questions/61899943/how-do-i-omit-rows-in-a-ggplot #renaming columns in R; https://sparkbyexamples.com/r-programming/rename-column-in-r/ #calculating the mean in R: https://www.statology.org/r-mean-by-group/ #percentiles in R: https://www.geeksforgeeks.org/r-language/how-to-calculate-percentiles-in-r/ #chatGPT : tried using chatGPT after several hours on my own to get the ggplot to be re-oriented without success

#4:

#5: #how to create likert stacked barchart: https://stackoverflow.com/questions/67196796/plot-stacked-bar-chart-of-likert-variables-in-r